Updated: Sep 20, 2023 at 21:59:35 PDT.

Working directory: /home/groups/jgrimmer/trump_blocs.

Reads:

  • data/ipums-cps/cps_00008.csv

Writes:

  • data/cps_clean.rds

Setup

rm(list = ls())

library(fst)
library(dplyr)
library(data.table)

Load data

cps <- fread("data/ipums-cps/cps_00008.csv")
cps[ , source := "CPS"]

Recode variables

cps[ ,
     `:=`(
         year   = as.character(YEAR),
         cpsid  = CPSIDP,
         weight = VOSUPPWT,
         statefips = coler::lz_pad(STATEFIP, 2),
         citizen = ifelse(YEAR < 1994, 1L, as.numeric(CITIZEN != 5)),
         urban = recode(METRO, "1" = "rural", "2" = "urban", "3" = "suburban",
                        .default = NA_character_),
         race   = case_when(RACE %in%
                                 c(200, 801, 805:807, 810:811, 814, 816, 818) ~ "black",
                            RACE == 100                                       ~ "white",
                            TRUE                                              ~ "other"),
         age = as.integer(AGE),
         gender = recode(SEX, `1` = "male", `2` = "female",
                         .default = NA_character_),
         faminc = FAMINC,

         # https://cps.ipums.org/cps-action/downloads/extract_files/cps_00003.xml#EDUC
         educ = as.numeric(EDUC) %>%
         {
             case_when(. <= 73                        ~ "HS or less",
                       # Includes associate's degree
                       . %in% 74:110                       ~ "some college",
                       # Includes 4+ years of college
                       . %in% c(111:125) ~ "college")
         },

         region = REGION %>%
           {
            case_when(. %in% 11:13  ~ "Northeast",
                     . %in% 21:23  ~ "Midwest",
                     . %in% 31:34  ~ "South",
                     . %in% 41:43  ~ "West",
                     . %in% 91:99  ~ NA_character_)
           },

         voted = recode(VOTED,
                        "01" = 0L, "02" = 1L, .default = NA_integer_)
     )
     ] %>% 
  # filter to CVAP
  filter(citizen == 1, age >= 18)

Copied from Will’s CCES code: age groups in 10-year bands, except under 20 is counted w/ 20s

mfloor <- function(x,base){
    base * floor(x/base)
}

cps[ , age_bin := mfloor(age, 10)]
cps[age < 20, age_bin := 20]
cps[!is.na(faminc),
    # Matching ANES cut points
    `:=`(fincome = faminc,
         faminc_quin = ntile(faminc, 5) %>%
           recode_factor(`1` = "1st",
                         `2` = "2nd",
                         `3` = "3rd",
                         `4` = "4th",
                         `5` = "5th"
           ),
         faminc_terc =
           ntile(faminc, 3) %>%
           recode_factor(`1` = "1st", `2` = "2nd", `3` = "3rd")
    ), by = .(year, source)
    ]
cps[!is.na(voted) & !is.na(faminc_quin),
    stopifnot(.N > 2e4), by = year]

Hispanic origin

From CPS-IPUMS codebook:

99999999 = N.I.U. (Not in Universe). 1968-1975: -9999997 (Loss of $9999 or more dollars).

cps[HISPAN %in% 100:612 & race %in% c("white", "other"),
    race := "hispanic"]

cps[ , hisp_origin :=
         case_when(HISPAN %in% 100:109 ~ "mexican",
                   HISPAN == 200       ~ "puerto rican",
                   HISPAN == 300       ~ "cuban",
                   HISPAN %in% 600:612 ~ "other",
                   # Fewer than 2000 respondents total
                   # HISPAN == 400       ~ "dominican",
                   # HISPAN == 500       ~ "salvadoran",
                   TRUE ~ NA_character_)
     ]

Add South dummy

fips <- fastLink::statefips

setDT(fips, key = "statefips")

fips[ , south := as.numeric(state %in%
                                c("TN", "VA", "NC", "SC", "FL",
                                  "GA", "AL", "MS", "LA", "AR", "TX",
                                  # Schickler adds these
                                  "OK", "KY"))
      ]
cps[fips,
    c("south", "state") := .(i.south, i.state),
    on = "statefips"
    ]

Missing weights…

For some reason, CPS is missing weights before 1976.

cps[is.na(weight) & year < 1976, weight := 1]

Export

keep <- grep("[a-z]", names(cps), value = TRUE)
cps_out <- dplyr::select(cps, all_of(keep))
setDT(cps_out)

skimr::skim(cps_out)
## Warning: Couldn't find skimmers for class: integer64; No user-defined `sfl`
## provided. Falling back to `character`.
Data summary
Name cps_out
Number of rows 3151844
Number of columns 21
Key NULL
_______________________
Column type frequency:
character 10
factor 3
numeric 8
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
source 0 1.00 3 3 0 1 0
year 0 1.00 4 4 0 23 0
cpsid 0 1.00 1 21 0 3079423 0
statefips 0 1.00 2 2 0 51 0
urban 643853 0.80 5 8 0 3 0
race 0 1.00 5 8 0 4 0
gender 0 1.00 4 6 0 2 0
educ 72422 0.98 7 12 0 3 0
region 0 1.00 4 9 0 4 0
hisp_origin 2861170 0.09 5 12 0 4 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
faminc_quin 425102 0.87 FALSE 5 1st: 545357, 2nd: 545356, 3rd: 545347, 4th: 545344
faminc_terc 425102 0.87 FALSE 3 1st: 908921, 2nd: 908916, 3rd: 908905
state 0 1.00 FALSE 51 CA: 260400, NY: 173061, TX: 157626, FL: 128725

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
weight 0 1.00 1962.56 1146.47 0 1136.19 1795.92 2711.42 32493.72 ▇▁▁▁▁
citizen 0 1.00 0.97 0.18 0 1.00 1.00 1.00 1.00 ▁▁▁▁▇
age 0 1.00 36.18 22.46 0 17.00 34.00 53.00 99.00 ▇▇▇▅▁
faminc 425102 0.87 672.50 231.86 100 540.00 730.00 840.00 999.00 ▂▂▃▅▇
voted 1143378 0.64 0.62 0.49 0 0.00 1.00 1.00 1.00 ▅▁▁▁▇
age_bin 0 1.00 36.03 17.90 20 20.00 30.00 50.00 90.00 ▇▂▃▁▁
fincome 425102 0.87 672.50 231.86 100 540.00 730.00 840.00 999.00 ▂▂▃▅▇
south 0 1.00 0.26 0.44 0 0.00 0.00 1.00 1.00 ▇▁▁▁▃
write_fst(cps_out, "data/cps_clean.fst")